libname rct '';
libname frame '';
libname frame20 '';
libname num '';
libname itin '';
libname numcit '';
libname irs17 '';
libname irs18 '';
libname maf '';
libname centest '';
libname tab '';

*this is the housing unit-level 2019 Census Test file;

/* Copy the housing unit-level file into the RCT library, replacing mafid with
the value read through the 9. informat (stored under the same name).
The comment statement above must end with a semicolon; without it the
comment runs on to the next semicolon and swallows the DATA statement. */
data rct.centest2019_hu_analysis_file (drop=mafid rename=(maf=mafid));
set centest.centest2019_hu_analysis_file;
maf=input(mafid,9.);
run;

*this is the person-level 2019 Census Test file;

/* Copy the person-level file into the RCT library, replacing mafid with
the value read through the 9. informat (stored under the same name).
The comment statement above must end with a semicolon; without it the
comment runs on to the next semicolon and swallows the DATA statement. */
data rct.centest2019_per_analysis_file (drop=mafid rename=(maf=mafid));
set centest.centest2019_per_analysis_file;
maf=input(mafid,9.);
run;

*here we get the household addresses (MAFID) in the 2019 Census Test;

/* Read mafid_rct from the RCT MAFID list under the name mafid.
The comment statement above must end with a semicolon; without it the
comment runs on to the next semicolon and swallows "data mafid;". */
data mafid;
set rct.rct_mafid_list (keep=mafid_rct rename=(mafid_rct=mafid));
run;

/* One row per MAFID, ordered for the later BY-merges on mafid. */
proc sort nodupkey data=mafid;
by mafid;
run;

/*besthisp=1 is Hispanic, besthisp=2 is non-Hispanic. bestrace=1 is White alone,
=2 is Black alone, =3 is AIAN alone, =4 is Asian alone, =5 is NHPI alone, =6 is Some 
Other Race alone, =7 is Two or More Races*/

/*here we obtain data from the 2019 Demographic Frame. We choose the person's MAFID with the highest 
person-place probability from the random forest model. We drop people with age over 114, who must be deceased.*/

/* Keep only rows flagged rf_best=1 and discard rows whose recorded age is
115 or more; rows with a missing age are retained. */
data frame (drop=rf_best);
  set frame.demoframe2019_t26_v1
      (keep=pik mafid num_mafids rf_best bestrace besthisp age sex);
  if rf_best ne 1 then delete;
  if age >= 115 then delete;
run;

/* One row per PIK, ordered for the BY-merge on pik. */
proc sort data=frame nodupkey;
  by pik;
run;

*this dataset is people who died before July 1, 2019;

/* PIKs whose recorded death date falls in 2018 or earlier, or in
January-June 2019. Only pik is written out. */
data dod (keep=pik);
  set num.cnum_2020q2_update (keep=pik dodcc dodyy dodmm);
  /* dodcc*100 + dodyy forms the year compared against 2018/2019 below */
  dodc = input(dodcc,2.);
  dody = input(dodyy,2.);
  dod_year = dodc*100 + dody;
  dodm = input(dodmm,2.);
  if (0 < dod_year <= 2018) or (dod_year = 2019 and dodm <= 6);
run;

/* One row per PIK, ordered for the BY-merge on pik. */
proc sort data=dod nodupkey;
  by pik;
run;

*this dataset is people with ITINs;

/* Copy the ITIN universe into WORK.itin, ordered by pik, in a single step. */
proc sort data=itin.pik_itin_universe out=itin;
  by pik;
run;

/*These data come from the Numident. cit_year is the first year the person was a citizen.
noncit_year is the last year the person was a noncitizen. pobst is the place of birth state or 
country. fb_num=1 if the person is foreign-born.*/

/* Numident citizenship extract. Keeps rows with no recorded death year or a
death year of 2019 or later, and sets latin=1 for foreign-born people whose
place-of-birth code is in the list below (latin=0 otherwise). */
data numcit (drop=dod_year pobst);
set numcit.num_ffu_2020q4 (keep=pik pobst dod_year fb_num cit_year noncit_year citizen);
if dod_year=. | dod_year>=2019;
if fb_num=1 & pobst in ("BZ","CR","SV","GT","HN","MX","NI","PA","BO","CO","EC","VE") then latin=1;
else latin=0;
run;

/* numcit is merged BY pik with frame, dod and itin, all of which are sorted
explicitly. Sort it here as well so the merge does not rely on the physical
order of the source file. */
proc sort data=numcit;
by pik;
run;

/*here we remove any remaining deceased people from the demographic frame, create
an ITIN indicator variable, create a naturalized citizen variable, and a 
noncitizen with SSN variable.*/

/* Combine the frame with the death list, ITIN list and Numident extract by
pik. Rows are kept only when the PIK is in the frame and not in the death
list. Each indicator is written as a 0/1 comparison result. */
data frame;
  merge frame  (in=in_frame)
        dod    (in=in_dod)
        itin   (in=in_itin)
        numcit;
  by pik;
  if in_frame and not in_dod;
  itin        = (in_itin = 1);
  nat_citizen = (fb_num = 1 and cit_year > 0 and cit_year <= 2019);
  noncit_ssn  = (2019 <= noncit_year <= 2020);
run;

/* Reorder by mafid for the address-level merges that follow. */
proc sort data=frame;
  by mafid;
run;

/*here we obtain variables from TY 2017 and 2018 IRS 1040 data. tmi is 
total monetary income. post is the week when the IRS received the return;*/

/* Stack the two IRS input files and derive the return-level variables used
later: tmi, post, married_filing_jointly and unmarried_head.
NOTE(review): the field names "var" and the comparison value "value" look
like redacted placeholders for distinct source fields - confirm the real
names before running. */
data irs (keep=pik mafid tmi post married_filing_jointly
unmarried_head);
set irs17.file (keep=var var var var mafid)
irs18.file (keep=var var var var mafid);
/* keep only returns that carry a MAFID */
if mafid~=.;
/* convert the income and posting-week fields to numeric */
tmi=input(var,12.);
post=input(var,6.);
/* 0/1 flags derived from a source field - presumably filing status; verify */
if var="value" then married_filing_jointly=1;
else married_filing_jointly=0;
if var="value" then unmarried_head=1;
else unmarried_head=0;
run;

/* Order returns by address and person, then record the largest post value
for each mafid/pik pair in max_irs. */
proc sort data=irs;
  by mafid pik;
run;

proc summary data=irs;
  by mafid pik;
  var post;
  output out=max_irs (keep=pik mafid max_post) max(post)=max_post;
run;

/*We convert 2017 income into 2018 dollars using the 
publicly available BLS annual CPI
for all urban consumers between 2017 and 2018*/

/* Keep, for each mafid/pik pair, only the rows whose post equals the pair's
maximum, and rescale tmi by 251.107/245.12 when post is below 201900. */
data irs (drop=post max_post);
  merge irs max_irs;
  by mafid pik;
  if post ne max_post then delete;
  if post < 201900 then tmi = tmi*251.107/245.12;
run;

/*here we calculate whether the MAFID has a return that is married filing jointly
or with an unmarried head of household, and we sum the total monetary income across returns;*/

/* Collapse returns to one row per MAFID: tot_tmi is the sum of tmi, and the
two filing-status flags are the maximum of the return-level flags. */
proc summary data=irs;
  by mafid;
  var tmi married_filing_jointly unmarried_head;
  output out=irs1 (keep=mafid tot_tmi married_filing_jointly unmarried_head)
    sum(tmi)=tot_tmi
    max(married_filing_jointly unmarried_head)=married_filing_jointly unmarried_head;
run;

/* Constant key so the single-row percentile table can be merged onto every row. */
data irs1;
  set irs1;
  p = 1;
run;

*here we calculate 20th, 40th, 60th, and 80th percentiles of total monetary income;

/* 20th/40th/60th/80th percentiles of tot_tmi, keyed by the constant p. */
proc summary data=irs1;
  by p;
  var tot_tmi;
  output out=tmi (keep=p p20 p40 p60 p80)
    p20(tot_tmi)=p20
    p40(tot_tmi)=p40
    p60(tot_tmi)=p60
    p80(tot_tmi)=p80;
run;

/* Assign each MAFID to an income category 1-5 using the percentile cut points. */
data irs1 (keep=mafid income_q married_filing_jointly unmarried_head);
  merge irs1 tmi;
  by p;
  if tot_tmi <= p20 then income_q = 1;
  else if tot_tmi > p20 and tot_tmi <= p40 then income_q = 2;
  else if tot_tmi > p40 and tot_tmi <= p60 then income_q = 3;
  else if tot_tmi > p60 and tot_tmi <= p80 then income_q = 4;
  else if tot_tmi > p80 then income_q = 5;
run;

*here we obtain housing structure type from the 2020 Census final tabulation MAF extract;

/* Copy mafid and hutyp from the MAF extract into WORK.maf, ordered by mafid,
in a single step. */
proc sort data=maf.mafx2020_cen20fintab_us (keep=mafid hutyp) out=maf;
  by mafid;
run;

*here we create indicators for different housing structure types;

/* Person-level analysis file: frame rows whose MAFID is in the Census Test
list, with MAFID-level IRS and MAF variables attached. Each structure-type
indicator is the 0/1 result of comparing hutyp to one code. */
data rct.person_ar (drop=hutyp);
  merge frame (in=in_frame)
        mafid (in=in_test)
        irs1  (in=in_irs)
        maf;
  by mafid;
  if in_frame and in_test;
  no_irs     = (in_irs = 0);
  multiunit  = (hutyp = "M");
  singleunit = (hutyp = "S");
  otherunit  = (hutyp = "O");
  mobileunit = (hutyp = "T");
run;

/* One counter row per person ... */
data hs;
  set rct.person_ar (keep=pik mafid);
  p = 1;
run;

/* ... summed within MAFID to give the number of frame people at the address. */
proc summary data=hs;
  by mafid;
  var p;
  output out=hh_size (keep=mafid hh_size) sum(p)=hh_size;
run;

/*here we create a categorical variable for household size, the noncitizen 
with SSN variable, the U.S.-born variable, race, ethnicity, and interactions
between U.S.-born and race and ethnicity;*/

/* Attach hh_size and derive person-level 0/1 indicators. Each indicator is
the result of a comparison (1 when true, 0 otherwise). The race indicators
are defined for non-Hispanic people only (hisp=0). */
data rct.person_ar;
  merge rct.person_ar hh_size;
  by mafid;

  /* household size categories */
  hh_size_1  = (hh_size = 1);
  hh_size_2  = (hh_size = 2);
  hh_size_3  = (hh_size = 3);
  hh_size_4  = (hh_size = 4);
  hh_size_5p = (hh_size >= 5);

  /* citizenship-related indicators */
  noncit_ssn = (2019 <= noncit_year <= 2020);
  us_born    = (cit_year > 0 and cit_year <= 2019 and fb_num = 0);

  /* ethnicity, then race among non-Hispanics */
  hisp            = (besthisp = 1);
  white           = (bestrace = 1 and hisp = 0);
  black           = (bestrace = 2 and hisp = 0);
  aian            = (bestrace = 3 and hisp = 0);
  asian           = (bestrace = 4 and hisp = 0);
  nhpi            = (bestrace = 5 and hisp = 0);
  some_other_race = (bestrace = 6 and hisp = 0);
  multirace       = (bestrace = 7 and hisp = 0);

  /* interactions with us_born */
  hisp_us_born      = hisp*us_born;
  white_us_born     = white*us_born;
  black_us_born     = black*us_born;
  aian_us_born      = aian*us_born;
  asian_us_born     = asian*us_born;
  nhpi_us_born      = nhpi*us_born;
  sor_us_born       = some_other_race*us_born;
  multirace_us_born = multirace*us_born;
run;

/*here we create age group categories, an indicator for having missing values for 
race or ethnicity for U.S.-born people, and a noncitizen indicator;*/

/* Working copy of the person file with age-group indicators, a flag for
missing race/ethnicity among people who are not foreign-born and have no
ITIN, and a noncitizen flag. Each is a 0/1 comparison result. */
data frame1;
  set rct.person_ar;
  age_0_4    = (0  <= age <= 4);
  age_5_17   = (5  <= age <= 17);
  age_18_24  = (18 <= age <= 24);
  age_25_44  = (25 <= age <= 44);
  age_45_64  = (45 <= age <= 64);
  age_65p    = (65 <= age <= 114);
  miss       = ((besthisp = . or bestrace = .) and fb_num ne 1 and itin = 0);
  noncitizen = (noncit_ssn = 1 or itin = 1);
run;

/*here we calculate whether there is at least one person in the MAFID
with the characteristic;*/

/* MAFID-level maxima. Output names are matched to the VAR list by position:
the six age-group flags keep their names, every other variable is written
as max_<name> (some_other_race becomes max_sor). Only the automatic
_TYPE_ and _FREQ_ columns are dropped. */
proc summary data=frame1;
  by mafid;
  var age_0_4 age_5_17 age_18_24 age_25_44 age_45_64 age_65p
      age itin noncit_ssn nat_citizen latin besthisp bestrace
      hisp white black aian asian nhpi some_other_race multirace
      hisp_us_born white_us_born black_us_born aian_us_born
      asian_us_born nhpi_us_born sor_us_born multirace_us_born
      miss noncitizen;
  output out=age (drop=_type_ _freq_)
    max=age_0_4 age_5_17 age_18_24 age_25_44 age_45_64 age_65p
        max_age max_itin max_noncit_ssn max_nat_citizen max_latin max_besthisp max_bestrace
        max_hisp max_white max_black max_aian max_asian max_nhpi max_sor max_multirace
        max_hisp_us_born max_white_us_born max_black_us_born max_aian_us_born
        max_asian_us_born max_nhpi_us_born max_sor_us_born max_multirace_us_born
        max_miss max_noncitizen;
run;

/*Here we calculate the minimum value for these variables within the MAFID. This is used 
for determining whether all people in the household have the characteristic;*/

/* MAFID-level minima of no_irs, noncitizen and itin. A minimum of 1 means
every person at the MAFID has the characteristic.
The output names are tied to their analysis variables explicitly. The
earlier list began with min_besthisp and min_bestrace, which are not in the
VAR list; names are matched by position, so min_no_irs received the minimum
of itin and min_noncitizen/min_itin had no analysis variable to pair with. */
proc means noprint data=frame1 min;
by mafid;
var no_irs noncitizen itin; 
output out=race (keep=mafid min_no_irs min_noncitizen min_itin)
min(no_irs noncitizen itin)=min_no_irs min_noncitizen min_itin;
run;

/*Here we calculate the average number of MAFIDs people in the MAFID have within the Demographic
Frame (including MAFIDs other than their highest-probability one);*/

/* Mean of the person-level num_mafids within each MAFID. */
proc summary data=frame1;
  by mafid;
  var num_mafids;
  output out=num_mafids (keep=mafid mean_num_mafids) mean(num_mafids)=mean_num_mafids;
run;

/*Here we calculate indicators for unmarried female head of household, all noncitizens,
mixed citizen and noncitizen households, all ITIN households, and mixed ITIN and SSN
households;*/ 

/* Attach the MAFID-level max, min and mean summaries to each person row and
derive household composition flags:
  all_*   - the max and the min are both 1 (everyone has the characteristic)
  mixed_* - the max is 1 and the min is 0 (some but not all have it)
female_unmarried_head is evaluated per person row: the person's age equals
the MAFID maximum, the MAFID has unmarried_head=1, and sex is "F".
The drop list no longer names min_bestrace and min_besthisp: they are never
present in the merged data, so listing them only produced log warnings. */
data rct.person_ar (drop=max_age max_bestrace max_besthisp
min_noncitizen min_itin);
merge rct.person_ar age race num_mafids;
by mafid;
if age=max_age & unmarried_head=1 & sex="F" then female_unmarried_head=1;
else female_unmarried_head=0;
if max_noncitizen=1 & min_noncitizen=1 then all_noncitizen=1;
else all_noncitizen=0;
if max_noncitizen=1 & min_noncitizen=0 then mixed_citizenship=1;
else mixed_citizenship=0;
if max_itin=1 & min_itin=1 then all_itin=1;
else all_itin=0;
if max_itin=1 & min_itin=0 then mixed_itin=1;
else mixed_itin=0;
run;

*Here we create a housing unit-level dataset from the person-level dataset;

/* Housing unit-level file: keep the MAFID-level columns of the person file,
rename the max_, min_ and mean_ summaries to their base names, then keep one
row per MAFID.
The keep list contains only variables created earlier in this program.
single_filer_other, single_filer, married_filing_separately,
surviving_spouse, husband_wife_exemption, head_child and same_race_hisp were
removed because nothing upstream creates them, and naming a nonexistent
variable in an input KEEP= list stops the step under the default
DKRICOND=ERROR setting. The repeated married_filing_jointly entry was also
removed.
NOTE(review): female_unmarried_head is a person-level value, so after the
NODUPKEY sort the retained value is that of whichever person row comes first
for the MAFID - confirm this is intended. */
data rct.mafid_ar;
set rct.person_ar (keep=mafid mean_num_mafids married_filing_jointly unmarried_head 
income_q multiunit singleunit otherunit mobileunit hh_size hh_size_1 hh_size_2 hh_size_3 hh_size_4
hh_size_5p age_0_4 age_5_17 age_18_24 age_25_44 age_45_64 age_65p max_itin max_noncit_ssn 
max_nat_citizen max_latin max_hisp max_white max_black max_aian max_asian max_nhpi max_sor max_multirace 
min_no_irs female_unmarried_head max_hisp_us_born max_white_us_born 
max_black_us_born max_aian_us_born max_asian_us_born max_nhpi_us_born max_sor_us_born 
max_multirace_us_born max_miss max_noncitizen all_noncitizen mixed_citizenship all_itin mixed_itin 
rename=(mean_num_mafids=num_mafids 
max_itin=itin max_noncit_ssn=noncit_ssn max_nat_citizen=nat_citizen max_latin=latin max_hisp=hisp 
max_white=white max_black=black max_aian=aian max_asian=asian max_nhpi=nhpi max_sor=some_other_race 
max_multirace=multirace max_hisp_us_born=hisp_us_born max_white_us_born=white_us_born 
max_black_us_born=black_us_born max_aian_us_born=aian_us_born max_asian_us_born=asian_us_born 
max_nhpi_us_born=nhpi_us_born max_sor_us_born=sor_us_born max_multirace_us_born=multirace_us_born 
min_no_irs=no_irs max_miss=miss max_noncitizen=noncitizen));
run;

proc sort nodupkey data=rct.mafid_ar;
by mafid;
run;

/*This selects the 2019 Census Test housing structures where at least one 
person is reported to usually live elsewhere;*/

/* cmid values that have at least one E_ULIVE row whose value is neither
blank nor "1"; one row per cmid. */
data rct.ulive (keep=cmid);
  set rct.rpernper (keep=cmid name value);
  if name ne "E_ULIVE" or value = "" or value = "1" then delete;
run;

proc sort data=rct.ulive nodupkey;
  by cmid;
run;
